# Notebook helper cell: injects a small jQuery snippet plus an HTML button that
# toggles the visibility of all code-input cells ("div.input"). The toggle runs
# once on document-ready, so code cells start hidden for a cleaner report view.
from IPython.display import HTML
HTML('''<script>
code_show=true;
function code_toggle() {
if (code_show){
$('div.input').hide();
} else {
$('div.input').show();
}
code_show = !code_show
}
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
# Analysis stack: pandas for tabular wrangling, matplotlib/folium for plotting
# and mapping, scikit-learn for clustering, geopy for geocoding, BeautifulSoup
# for any HTML scraping. Grouped stdlib / third-party per PEP 8.
import csv
import json
import os
import urllib

import folium
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
# `pandas.io.json.json_normalize` was deprecated in pandas 0.25 and removed in
# later releases; the supported public location is `pandas.json_normalize`.
from pandas import json_normalize
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

print('Start JGR')
When visiting New York, you will surely want to try something different to eat, perhaps some international cuisine. But if you don't know where to look, and you want ratings you can trust — not just likes on social media — there is a study from the Department of Health and Mental Hygiene of New York City that grades the restaurants.
The final objective looks for the type of cuisine by Neighborhood and its grades, so you can visit with confidence the restaurant you liked.
The following Inspection Results from the DOHMH (Department of Health and Mental Hygiene from New York city), is defined in order to look for the best restaurants by type of cuisine by Borough.
Find below a small description from the DOHMH.
"The dataset contains every sustained or not yet adjudicated violation citation from every full or special program inspection conducted up to three years prior to the most recent inspection for restaurants and college cafeterias in an active status on the RECORD DATE (date of the data pull). When an inspection results in more than one violation, values for associated fields are repeated for each additional violation record. Establishments are uniquely identified by their CAMIS (record ID) number."
The data has been taken from the link below and saved as a CSV file.
The Dataframe has 384,487 rows and 18 columns, but it is required to do some clean up of the file to get just the information for the purpose of this analysis.
# Load the DOHMH inspection results, keep only the columns needed for the
# cuisine/grade analysis, and sort by location with the highest scores first.
rest = pd.read_csv('New_York_Restaurant_Inspection_Results.csv')
rest.head()
print(rest.shape)
unneeded = ['CAMIS', 'PHONE', 'BUILDING', 'INSPECTION DATE', 'ACTION',
            'VIOLATION CODE', 'VIOLATION DESCRIPTION', 'CRITICAL FLAG',
            'GRADE DATE', 'RECORD DATE', 'INSPECTION TYPE']
rest = rest.drop(columns=unneeded)
rest.head()
rest.shape
sort_keys = ['BORO', 'ZIPCODE', 'CUISINE DESCRIPTION', 'SCORE']
rest = rest.sort_values(sort_keys, ascending=[True, True, True, False])
rest.head()
rest.shape
Now the DataFrame has been cleaned a little, but it can be improved further. After some of the columns were deleted, many of the remaining rows became duplicates, so those duplicates are going to be removed. From 384,487 rows, we are now down to 21,292 rows of cleaner data.
# Collapse duplicates: keep a single row per DBA value, retaining the last
# occurrence (which, after the sort above, is the lowest SCORE in each group).
rest = rest.drop_duplicates(subset=['DBA'], keep='last')
rest.head()
print(rest.shape)
We don't want our dataset to include restaurants that haven't been rated yet — we really do want to know their grades — so those with NaN in their Scores are going to be dropped.
So from those 21,292 rows, the dataset now has 12,154. A whole new different quantity from the amount of rows at the beginning.
# Drop rows with any missing value (removes un-scored restaurants), then give
# the surviving columns clearer names in a single rename pass.
rest = rest.dropna(how='any')
rest.head()
rest.shape
rest.rename(
    columns={
        'DBA': 'RESTAURANT',
        'BORO': 'BOROUGH',
        'CUISINE DESCRIPTION': 'CUISINE',
    },
    inplace=True,
)
print('rest.name Dataframe')
rest.head()
The last dataset, rest, has the Borough name and its ZIP codes, but we want a little more detail about the neighborhood and the geographical coordinates, so we can use the Foursquare information.
For this purpose, I copied the table from the Department of Health of New York State, organized it in Excel, and then converted it into a CSV file. Some behind-the-scenes manipulation in Excel was needed to produce this file.
https://www.health.ny.gov/statistics/cancer/registry/appendix/neighborhoods.htm
Below, you'll find the CSV file and how it was manipulated to get the ZIP codes by each Neighborhood, so they can be linked with the Restaurant Names, rest.names, Dataframe
nyzips=pd.read_csv('NY Bogorughs zip codes.csv')
nyzips.head()
nyzipsT=pd.melt(nyzips, id_vars=['Borough','Neighborhood'],value_vars=['zip1','zip2','zip3','zip4','zip5','zip6','zip7','zip8','zip9'])
nyzipsT.tail()
nyzipsT=nyzipsT.dropna(axis=0, how='any')
nyzipsT.tail()
nyzipsT.rename(columns = {'value':'ZIPCODE'}, inplace = True)
#nyzipsT=nyzipsT.drop(['variable'],axis=1)
nyzipsT.head()
Now we have the Borough name, the Neighborhood and the ZIP codes, let's look for the coordinates of each one.
From the US Census Bureau, Gazetteer Files I downloaded the file and save it as csv
ZIP Code Tabulation Areas https://www.census.gov/geographies/reference-files/2017/geo/gazetter-file.html
zip file https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_zcta_national.zip
# Load ZCTA centroid coordinates from the Census Gazetteer extract and
# normalize column names so the table can later be merged on ZIPCODE.
zipcoor = pd.read_csv('ZIP Codes and coordinates.csv')
zipcoor.head()
zipcoor = zipcoor.rename(columns={
    'GEOID': 'ZIPCODE',
    'INTPTLAT': 'LATITUDE',
    'INTPTLONG': 'LONGITUDE',
})
zipcoor = zipcoor.drop(columns=['ALAND', 'AWATER', 'ALAND_SQMI', 'AWATER_SQMI'])
zipcoor.head()
Now it's time to merge all the information.
# Inner-join on the shared ZIPCODE column: first attach coordinates to each
# neighborhood row, then attach the restaurant records for those ZIP codes.
dfout = nyzipsT.merge(zipcoor)
dfout.head()
merged = dfout.merge(rest)
merged.head()
Drop the columns that we don't want. It is twice the Borough column.
The 'variable' column was just left over from the 'melting' process, and it is useless.
merged=merged.drop(['variable','BOROUGH'],axis=1)
Show the last dataframe, the one that's going to be used in the map.
Now it has 10,884 rows and 10 rearranged columns
# Preview the final merged table and report its dimensions.
merged.head()
merged.shape
# Render the merged restaurant/neighborhood data as circle markers on a
# folium map centered on New York City.
address = 'New York City, NY'

# Nominatim returns None when the lookup fails (rate limiting, network
# problems); fail loudly here instead of crashing later with a confusing
# AttributeError on `location.latitude`.
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
if location is None:
    raise RuntimeError('Geocoding failed for {!r}; try again later.'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geographical coordinate of New York City are {}, {}.'.format(latitude, longitude))

# create map of New York using latitude and longitude values
map_newyork = folium.Map(location=[latitude, longitude], zoom_start=10)

# add one marker per merged row, labeled "Neighborhood, Borough"
for lat, lng, borough, neighborhood in zip(merged['LATITUDE'], merged['LONGITUDE'],
                                           merged['Borough'], merged['Neighborhood']):
    label = folium.Popup('{}, {}'.format(neighborhood, borough))
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
    ).add_to(map_newyork)
map_newyork